import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import scala.io.Source
 
/**
 * Spark driver that trains a KMeans model on the synthetic_control dataset
 * and prints the learned cluster centers plus the cluster assignment of
 * every input vector.
 */
object KmeansSpark {

  def main(args: Array[String]): Unit = {

    // Run Spark locally (single-threaded local master).
    val sparkConf = new SparkConf().setAppName("KmeansSpark").setMaster("local")
    val sc = new SparkContext(sparkConf)

    // Load the local data file into an RDD of dense vectors.
    // Each input line is a whitespace-separated list of doubles.
    val data = sc.textFile("file:///usr/share/synthetic_control.data")
    val parsedData: RDD[linalg.Vector] = data.map { line =>
      // trim guards against leading/trailing whitespace producing an empty
      // token that would fail toDouble
      val values: Array[Double] = line.trim.split("\\s+").map(_.toDouble)
      Vectors.dense(values)
    }.cache() // reused twice: once for training, once for prediction below

    // Number of cluster centers.
    val numClusters = 8
    // Maximum number of algorithm iterations.
    val numIterations = 10
    // Number of parallel runs; the best resulting model is kept.
    val runs = 5

    // Train the KMeans model.
    val model: KMeansModel = KMeans.train(parsedData, numClusters, numIterations, runs)

    println("Cluster Number:" + model.clusterCenters.length)

    // Print each cluster center together with its index
    // (zipWithIndex replaces the original mutable counter).
    println("Cluster Centers Information Overview:")
    model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
      println("Center Point of Cluster " + idx + ":")
      println(center)
    }

    // Print which cluster each data point belongs to.
    // BUG FIX: the original discarded the mapped RDD (lazy, never evaluated)
    // and then called `foreach` on the SparkContext, which does not compile.
    // Iterate the mapped RDD instead; collect() brings results to the driver
    // so println output appears locally.
    parsedData
      .map(v => v.toString + " belong to cluster: " + model.predict(v))
      .collect()
      .foreach(println)

    sc.stop()
  }
}

